{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import numpy as np\n",
    "from bs4 import BeautifulSoup\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# sklearn.cross_validation was deprecated in 0.18 and is removed in 0.20;\n",
    "# prefer model_selection and only fall back on installs that lack it.\n",
    "try:\n",
    "    from sklearn.model_selection import train_test_split\n",
    "except ImportError:\n",
    "    from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_string(string):\n",
    "    string = re.sub('[^A-Za-z0-9\\-\\/ ]+', ' ', string).split()\n",
    "    return [y.strip() for y in string]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parse_raw(filename):\n",
    "    \"\"\"Read a tagged text file and return parallel token and label lists.\n",
    "\n",
    "    The file content is parsed with BeautifulSoup and walked line by line\n",
    "    over ``soup.prettify()``:\n",
    "\n",
    "    * a line starting with '</' is skipped;\n",
    "    * a line starting with '<' sets the pending label to the text between\n",
    "      that '<' and the first '>';\n",
    "    * the line directly after such a line is tokenised with\n",
    "      ``process_string`` and every token receives the pending label;\n",
    "    * any other line is tokenised and labelled 'OTHER'.\n",
    "\n",
    "    NOTE(review): this assumes prettify() puts each tag and its text on\n",
    "    separate lines with the tags unindented (flat, non-nested markup) --\n",
    "    confirm against the data files.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str\n",
    "        Path of the file to read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple (texts, labels)\n",
    "        Two lists of equal length: the tokens and one label per token.\n",
    "        The number of tokens is also printed.\n",
    "    \"\"\"\n",
    "    with open(filename, 'r') as fopen:\n",
    "        entities = fopen.read()\n",
    "    soup = BeautifulSoup(entities, 'html.parser')\n",
    "    inside_tag = ''\n",
    "    texts, labels = [], []\n",
    "    for sentence in soup.prettify().split('\\n'):\n",
    "        if inside_tag:\n",
    "            # Line right after an opening tag: its tokens take that label.\n",
    "            label, inside_tag = inside_tag, ''\n",
    "        elif sentence.startswith('</'):\n",
    "            # Closing tag: contributes no tokens.\n",
    "            continue\n",
    "        elif sentence.startswith('<'):\n",
    "            # Opening tag: remember its name for the following line.\n",
    "            inside_tag = sentence.split('>')[0][1:]\n",
    "            continue\n",
    "        else:\n",
    "            label = 'OTHER'\n",
    "        splitted = process_string(sentence)\n",
    "        texts += splitted\n",
    "        labels += [label] * len(splitted)\n",
    "    assert (len(texts)==len(labels)), \"length texts and labels are not same\"\n",
    "    print('len texts and labels: ', len(texts))\n",
    "    return texts,labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len texts and labels:  34012\n"
     ]
    }
   ],
   "source": [
    "# Tokenise the training file into parallel token / label lists.\n",
    "train_texts, train_labels = parse_raw('data_train.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "len texts and labels:  9249\n"
     ]
    }
   ],
   "source": [
    "# Parse the second file and fold it into the training lists in place; the\n",
    "# train/test split is made later on the combined data.\n",
    "# Re-running this cell appends the same tokens a second time.\n",
    "test_texts, test_labels = parse_raw('data_test.txt')\n",
    "train_texts.extend(test_texts)\n",
    "train_labels.extend(test_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'location', 'organization', 'person', 'quantity', 'time'],\n",
       "       dtype='<U12'), array([35613,  1536,  1592,  2358,  1336,   826]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count how many tokens carry each label so far.\n",
    "np.unique(train_labels,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the word/tag lexicon; only the first two whitespace-separated fields\n",
    "# of each line are used.\n",
    "with open('entities-bm-normalize-v3.txt','r') as fopen:\n",
    "    # Iterating the file keeps a final line that has no trailing newline.\n",
    "    entities_bm = [i.split() for i in fopen]\n",
    "# Skip blank lines, which have no fields to index.\n",
    "entities_bm = [i for i in entities_bm if i]\n",
    "# Only the exact word 'jam' is forced to TIME. The previous substring test\n",
    "# (i[0] in 'jam') also matched 'j', 'a', 'm', 'ja' and 'am'.\n",
    "entities_bm = [[i[0],'TIME' if i[0] == 'jam' else i[1]] for i in entities_bm]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'KN'\n",
      "'KA'\n"
     ]
    }
   ],
   "source": [
    "# Translate lexicon tags into the label names used for training.\n",
    "replace_by = {'LOC':'location','PRN':'person','NORP':'organization','ORG':'organization','LAW':'law',\n",
    "             'EVENT':'OTHER','FAC':'organization','TIME':'time','O':'OTHER','ART':'person','DOC':'law'}\n",
    "for word, tag in entities_bm:\n",
    "    # Clean the word once and reuse the result.\n",
    "    tokens = process_string(word)\n",
    "    if not tokens:\n",
    "        # Nothing left after cleaning, so there is no token to add.\n",
    "        continue\n",
    "    try:\n",
    "        label = replace_by[tag]\n",
    "    except KeyError as e:\n",
    "        # Tag has no mapping: report it and leave the entry out.\n",
    "        print(e)\n",
    "        continue\n",
    "    # Only the first cleaned token of each entry is kept.\n",
    "    train_texts.append(tokens[0])\n",
    "    train_labels.append(label)\n",
    "\n",
    "assert (len(train_texts)==len(train_labels)), \"length texts and labels are not same\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['OTHER', 'law', 'location', 'organization', 'person', 'quantity',\n",
       "        'time'], dtype='<U12'),\n",
       " array([47406,   107,  2010,  2435,  3913,  1336,  1240]))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Recount tokens per label after the lexicon entries were appended.\n",
    "np.unique(train_labels,return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(58447, 21197)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Integer-encode the labels.\n",
    "target = LabelEncoder().fit_transform(train_labels)\n",
    "\n",
    "# Case-sensitive character 2- to 4-gram counts for every token.\n",
    "bow_chars = CountVectorizer(\n",
    "    analyzer='char',\n",
    "    ngram_range=(2, 4),\n",
    "    lowercase=False\n",
    ")\n",
    "bow_chars.fit(train_texts)\n",
    "vectors = bow_chars.transform(train_texts)\n",
    "vectors.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows. The fixed seed makes the split, and therefore\n",
    "# the scores reported below, reproducible across runs.\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size = 0.2, random_state = 42)\n",
    "# Drop the full feature matrix; re-running this cell requires re-running\n",
    "# the vectorising cell first.\n",
    "del vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       OTHER       0.95      0.93      0.94     37870\n",
      "         law       0.81      0.33      0.47        87\n",
      "    location       0.68      0.73      0.70      1613\n",
      "organization       0.53      0.70      0.61      1957\n",
      "      person       0.74      0.83      0.78      3174\n",
      "    quantity       0.61      0.42      0.50      1094\n",
      "        time       0.69      0.66      0.67       962\n",
      "\n",
      " avg / total       0.90      0.89      0.89     46757\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Fit the classifier, then report its scores on the rows it was fitted on.\n",
    "multinomial = MultinomialNB()\n",
    "multinomial.fit(train_X, train_Y)\n",
    "print(metrics.classification_report(\n",
    "    train_Y,\n",
    "    multinomial.predict(train_X),\n",
    "    target_names = np.unique(train_labels)\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "       OTHER       0.95      0.93      0.94      9536\n",
      "         law       0.56      0.25      0.34        20\n",
      "    location       0.60      0.67      0.63       397\n",
      "organization       0.46      0.62      0.53       478\n",
      "      person       0.66      0.75      0.70       739\n",
      "    quantity       0.47      0.33      0.39       242\n",
      "        time       0.69      0.59      0.64       278\n",
      "\n",
      " avg / total       0.88      0.87      0.88     11690\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Same report on the held-out rows.\n",
    "print(metrics.classification_report(\n",
    "    test_Y,\n",
    "    multinomial.predict(test_X),\n",
    "    target_names = np.unique(train_labels)\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Persist the fitted classifier and the fitted vectoriser, one file each.\n",
    "for path, artefact in (('multinomial-entities.pkl', multinomial),\n",
    "                       ('bow-entities.pkl', bow_chars)):\n",
    "    with open(path, 'wb') as fopen:\n",
    "        pickle.dump(artefact, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
